Render this script with > sbatch ~/spinal_cord_paper/scripts/Seurat_Gg_NT_int_array.sh

load the data

the following workflow is based on: - [https://github.com/satijalab/seurat/issues/1500] Cell cycle regression - [https://github.com/satijalab/seurat/issues/1836] General workflow - [https://github.com/satijalab/seurat/issues/2590] Integrate all features (although used in the integration script)

library(Seurat)
library(dplyr)
library(Rtsne)
library(RColorBrewer)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(patchwork)
library(cowplot)
library(ggdendro)
gnames <- modplots::gnames
# load the integrated seurat
my.int <- readRDS(file = "~/spinal_cord_paper/data/Gg_devel_integrated_070223.rds")

my.int@project.name <- "Gg_devel_int"

my.int

Date: 07.02.23

inspect

We order the plot by nCount_RNA to prevent overplotting of the last sample (default order is the group.by ident).

# have a look at the PC and the previously calculated UMAP
PCAPlot(
  my.int,
  group.by = "orig.ident",
  order = "nCount_RNA"
  )

DimPlot(
  my.int,
  reduction = "umap",
  group.by = "orig.ident",
  order = "nCount_RNA"
  )

cell cycle scoring

Now we can score the cell cycle stage, using Seurats function. For this wee need the ortholog of the distributed cell stage marker lists.

We do the CC.difference calculation at this stage - and not prior to integration - based on the suggestion from: [https://github.com/satijalab/seurat/issues/1500]

We add the scores of stages S and G2M, as well as the difference between them to the metadata with the names: S, G2M, CC.Difference

# Load the orhtology table
ortho_gg_mm_v102 <- readRDS("~/spinal_cord_paper/data/ortho_gg_mm_v102.rds") 

colnames(ortho_gg_mm_v102) <- c(
  "GG_gene_ID",
  "GG_gene_Name",
  "MM_gene_ID",
  "MM_gene_Name",
  "ortho_conf",
  "homolog_type"
)

# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  We can
# segregate this list into markers of G2/M phase and markers of S phase
s.genes <- ortho_gg_mm_v102 %>%
  dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
  dplyr::filter(MM_gene_Name %in% cc.genes$s.genes) %>%
  dplyr::arrange(match(MM_gene_Name, cc.genes$s.genes)) %>%
  dplyr::pull(GG_gene_ID)

g2m.genes <- ortho_gg_mm_v102 %>%
  dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
  dplyr::filter(MM_gene_Name %in% cc.genes$g2m.genes) %>%
  dplyr::arrange(match(MM_gene_Name, cc.genes$g2m.genes)) %>%
  dplyr::pull(GG_gene_ID) 

t0 <- Sys.time()
my.int <- CellCycleScoring(
  my.int,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
  )
paste0("Seurats CC scoring took ", Sys.time() - t0, " seconds to run.")

my.int$CC.Difference.seurat <- my.int$S.Score - my.int$G2M.Score
# view cell cycle scores and phase assignments
head(my.int[[]])

rm(t0, s.genes, g2m.genes)

PCA plot of non-regressed data

PCAPlot(my.int, group.by = "Phase")

rescale integrated assay

We rescale the data to regress out the CC.difference

all_genes <- rownames(my.int)

my.int <- ScaleData(
  my.int,
  features = all_genes,
  vars.to.regress = "CC.Difference.seurat"
)
Regressing out CC.Difference.seurat
Centering and scaling data matrix

PCA

# Run the actual PCA (by default uses var.features of assay)
my.int <- RunPCA(my.int, verbose = F)

PCAPlot(my.int, group.by = "Phase")

# Which dimensions will we choose?
hist(my.int@reductions$pca@stdev^2, breaks = 500)

my.dimensions=1:20

Dim Reduction

We will run the tSNE. Using the FFT-accelerated Interpolation-based t-SNE (FIt-SNE). We run two, a normal tSNE, and an exaggerated tSNE, where clusters are tighter togheter. This is located under the xtsne name.

# Get the tSNE function
source("~/Software/FIt-SNE-master/fast_tsne.R")

my.tsne = fftRtsne(my.int@reductions$pca@cell.embeddings[,my.dimensions],
                   max_iter = 1000,
                   learning_rate =  round(dim(my.int)[2]/12),
                   initialization =(my.int@reductions$pca@cell.embeddings[,1:2]/my.int@reductions$pca@stdev[1])*0.0001,
                   perplexity_list = c(30, round(dim(my.int)[2]/100)),
                   fast_tsne_path="~/Software/FIt-SNE-master/bin/fast_tsne")

colnames(my.tsne) = c("tsne_1", "tsne_2")
rownames(my.tsne) = colnames(my.int)

my.int[["tsne"]] = CreateDimReducObject(embeddings = my.tsne, key = "tsne_", assay = DefaultAssay(my.int), global = T)

Clustering

Here we perform the Louvain-jaccard clustering implemented in Seurat. We can see the tree of clusters, to see how the clusters relate in the PCA space.
We also use the tree, to check if any of the terminal pairs of sisters should be merged. This is determined based on a minimum of 20 DEs between the clusters.

# Find first the nearest neighbors
my.int <- FindNeighbors(object = my.int, dims=my.dimensions, verbose = F)

# Then the actual clusters
my.int <- FindClusters(object = my.int, resolution = 1, verbose = F, random.seed = 42)

# Check the tree of clusters, to see what's the relationship between them
my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
plot(Tool(object = my.int, slot = 'BuildClusterTree'))

# We are gonna check for DEs using the non integrated data. We are only gonna test genes that have variability, so we calculate variable genes
my.int <- FindVariableFeatures(my.int, assay = "RNA")


# Only the genes with variability > median
my.HVF <- HVFInfo(my.int, assay = "RNA")
my.HVF <- rownames(my.HVF)[which(my.HVF[,3] > (median(my.HVF[,3])))]
# We check for pairs of clusters, how mayn DEs they have. If less than n, we merge them
keep.check <- T
while (keep.check == T) {
  
  keep.check <- F
  # Check the tree of clusters, to see what's the relationship between them
  my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
  # Check only the terminal sisters
  to.check = ips::terminalSisters(my.int@tools$BuildClusterTree)
  for (i in to.check) {
    # DE between the sisters
    my.DE <- FindMarkers(my.int, i[1], i[2], test.use = "MAST", latent.vars = c("CC.Difference.seurat"),
                         min.pct = 0.25, verbose = T, assay = "RNA", features = my.HVF,) %>%
      dplyr::filter(abs(avg_log2FC) > 0.5) %>%
      dplyr::filter(p_val_adj < 0.05)
    
    # If less than 5, merge, and repeat
    if (dim(my.DE)[1] < 5) {
      cat(paste0(dim(my.DE)[1], " genes differentially expressed between clusters ",i[1]," and ",i[2]," merging \n"))
      my.int <- SetIdent(my.int, cells = WhichCells(my.int, idents = i[2]), value = i[1])
      keep.check <- T
    }
    print(i)
  }
}
rm(to.check, my.HVF, my.DE)
# renumber starting from 1
my.ID <- factor(
  Idents(my.int),
  levels= levels(Idents(my.int))[base::order(as.numeric(levels(Idents(my.int))))])
levels(my.ID) <- seq(length(levels(my.ID)))
Idents(my.int) <- my.ID
my.int[["seurat_clusters"]] <- my.ID

# Check again the clusters
dim1 <- DimPlot(my.int, reduction = "tsne", cols = rainbow(length(levels(my.ID))), label = T, label.size = 5, pt.size = 1)
dim1

Plot the tree again

# tree based on PCA dims
my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
plot(Tool(object = my.int, slot = 'BuildClusterTree'))

Now that we have the final reductions, we’ll choose one and we look at the statistics of the cells.

modplots::mFeaturePlot(my.int, my.features = c("OLIG2", "SOX9", "TUBB3", "SHH", "SLC18A3", "GATA2", "PAX2", "TLX3", "IGFBP7", "HBBA", "IFI30", "MSX2", "PLP1"), gnames = gnames)

# set and get dim.reduct embeddings
my.reduc <- "tsne"
emb <- data.frame(Embeddings(my.int, my.reduc))
colnames(emb) <- c("reduc_1", "reduc_2")

meta <- my.int[[]] %>%
  tibble::rownames_to_column("cell_ID")


my.plots = list()

my.plots[[1]] = ggplot(emb[meta[order(meta$nCount_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nCount_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="UMI count", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[2]] = ggplot(emb[meta[order(meta$nFeature_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nFeature_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="gene count", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[3]] = ggplot(emb[meta[order(meta$nCount_SCT),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nCount_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="UMI count (SCT)", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[4]] = ggplot(emb[meta[order(meta$nFeature_SCT),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nFeature_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="gene count (SCT)", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[5]] = ggplot(emb[meta[order(meta$percent.mt),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=(sort(meta$percent.mt))), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="log1p MT percent", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[6]] = ggplot(emb[meta[order(meta$CC.Difference.seurat),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$CC.Difference.seurat)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="Cell Cycle\nS-G2M", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[7]] = ggplot(emb[meta[order(meta$percent.rb),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$percent.rb)), size=2, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="percent.rb")


grid.arrange(grobs=my.plots, ncol=2)

dim.orig.ident <- ggplot(emb, aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=meta$orig.ident[sample(x = seq(nrow(emb)), size = nrow(emb), replace = FALSE)]), size=0.8, alpha = 0.6, pch = 19) + 
    scale_colour_manual(values = rainbow(length(table(my.int@meta.data$orig.ident))) ) +
    theme_classic() + labs(colour="Dataset", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

dim.orig.ident

DV domain plots

To identify the different DV domains of the neuron and progenitor clusters, we plot their specific markers.

neurons <- list(dI1 = c("LHX2","LHX9","BARHL1","BARHL2","POU4F1"),
  dI2 = c("LHX1","LHX5","POU4F1"),
  dI3 = c("ISL1","TLX3","DRGX","POU4F1"),
  dI4 = c("LBX1","PAX2","LHX1","LHX5"),
  dI5 = c("LBX1","LMX1B","TLX3","DRGX","POU4F1"),
  dI6 = c("LBX1","PAX2","LHX1","LHX5"),
  V0 = c("EVX1","PAX2","LHX1","LHX5"),
  V1 = c("EN1","PAX2","LHX1","LHX5"),
  V2a = c("VSX1","SOX14","LHX3"),
  V2b = c("GATA2","GATA3","TAL1"),
  MN =  c("MNX1","ISL1","LHX3","ISL2", "SLC18A3"))

prog <- list(dp1_3 =  c("PAX6","IRX3","IRX5","MSX1","PAX3","PAX7"),
  dp4 = c("PAX6","IRX3","IRX5","GSX1","PAX3","PAX7"),
  dp5 = c("PAX6","IRX3","IRX5","DBX2","GSX1","PAX3","PAX7"),
  dp6 = c("PAX6","IRX3","IRX5","DBX2","LEUTX","PAX3","PAX7"),
  p0 = c("PAX6","IRX3","IRX5","DBX2","LEUTX"),
  p1 = c("PAX6","IRX3","IRX5","DBX2","PRDM12"),
  p2 = c("PAX6","IRX3","IRX5","FOXN4","NKX6-1"),
  pMN = c("PAX6","OLIG2","NKX6-1"),
  p3 = c("NKX2-8","NKX2-2","NKX6-1"))

# allows to use mFeaturePlot with lapply
feat_list_plot <- function(x) {
 plot <- modplots::mFeaturePlot(my.int, my.features = x,
                       gnames = gnames, size = 0.2, return = TRUE)
 return(plot)
}

tsne_dim <- TSNEPlot(
  my.int,
  reduction = "tsne",
  cols = rainbow(length(levels(my.ID))),
  pt.size = 0.01,
  label.size = 1,
  label = TRUE
) +
  ggplot2::theme(legend.position = "none")

plots_prog <- lapply(prog, feat_list_plot)
plots_neur <- lapply(neurons, feat_list_plot)
Not all features are present in your object! Removing:  LHX9 BARHL2 
Not all features are present in your object! Removing:  LHX5 
Not all features are present in your object! Removing:  DRGX 
Not all features are present in your object! Removing:  LBX1 LHX5 
Not all features are present in your object! Removing:  LBX1 DRGX 
Not all features are present in your object! Removing:  LBX1 LHX5 
Not all features are present in your object! Removing:  LHX5 
Not all features are present in your object! Removing:  LHX5 
Not all features are present in your object! Removing:  MNX1 ISL2 
pdf(paste0("~/spinal_cord_paper/figures/DV_prog_domain_",my.int@project.name ,".pdf"), width = 7, height = 5)
  for (j in names(prog)) {
    plots_prog[[j]][["tsne"]] <- tsne_dim
    gridExtra::grid.arrange(grobs = plots_prog[[j]], ncol = 3, top = j)
  }
dev.off()
png 
  2 
pdf(paste0("~/spinal_cord_paper/figures/DV_neur_domain_",my.int@project.name ,".pdf"), width = 7, height = 5)
  for (j in names(neurons)) {
    plots_neur[[j]][["tsne"]] <- tsne_dim
    gridExtra::grid.arrange(grobs = plots_neur[[j]], ncol = 3, top = j)
  }
  dev.off()
png 
  2 

DE analysis

Here we do the differential expression analysis, and end up with the marker genes lists. We can also see the marker gene dot plot, for the top 2 marker genes per cluster

# Find all the marker genes, with these thresholds MAST
semarkers = FindAllMarkers(my.int,
                           features = my.int[["integrated"]]@var.features, 
                           only.pos = TRUE, 
                           min.pct = 0.25, 
                           logfc.threshold =  0.5,
                           latent.vars = c("CC.Difference.seurat"), 
                           test.use = "MAST", 
                           assay = "RNA", 
                           return.thresh = 0.05)

# We only keep the significant ones
semarkers <- semarkers %>%
  filter(p_val_adj < 0.05) %>% 
  rename(Gene.stable.ID = gene) %>% 
  left_join(gnames, by = "Gene.stable.ID")


# Take only the top 10
semrk10 = semarkers %>% group_by(cluster) %>% top_n(-10, p_val_adj)
semrk1 = semarkers %>% group_by(cluster) %>% top_n(-1, p_val_adj)

modplots::mDotPlot2(my.int, 
          features = unique(semrk1$Gene.stable.ID), 
          cols = c("grey", "black"),  
          gnames = gnames, dot.scale = 6) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

DoHeatmap(my.int, semrk1$Gene.stable.ID)

save

saveRDS(my.int, paste0("~/spinal_cord_paper/data/", my.int@project.name, "_seurat_",format(Sys.Date(), "%d%m%y"),".rds"))

write.table(semarkers, sep = "\t", row.names = T, col.names = T,
            file = paste0("~/spinal_cord_paper/data/", my.int@project.name, "_fullDE_",format(Sys.Date(), "%d%m%y"),".txt"), quote = F)

write.table(semrk10, sep = "\t", row.names = T, col.names = T,
            file = paste0("~/spinal_cord_paper/data/", my.int@project.name, "_top10DE_",format(Sys.Date(), "%d%m%y"),".txt"), quote = F)
# Date and time of Rendering
Sys.time()
[1] "2023-02-07 14:04:19 CET"
sessionInfo()
R version 4.1.0 (2021-05-18)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /scicore/soft/apps/OpenBLAS/0.3.1-GCC-7.3.0-2.30/lib/libopenblas_sandybridgep-r0.3.1.so

locale:
[1] en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggdendro_0.1.22    cowplot_1.1.1      patchwork_1.1.1    gridExtra_2.3     
 [5] ggplot2_3.3.3      tidyr_1.1.3        RColorBrewer_1.1-2 Rtsne_0.15        
 [9] dplyr_1.0.10       SeuratObject_4.0.2 Seurat_4.0.5      

loaded via a namespace (and not attached):
  [1] fastmatch_1.1-0             plyr_1.8.6                 
  [3] igraph_1.2.6                lazyeval_0.2.2             
  [5] sp_1.4-5                    splines_4.1.0              
  [7] listenv_0.8.0               scattermore_0.7            
  [9] GenomeInfoDb_1.28.0         digest_0.6.27              
 [11] htmltools_0.5.1.1           ips_0.0.11                 
 [13] fansi_0.5.0                 magrittr_2.0.1             
 [15] memoise_2.0.0               tensor_1.5                 
 [17] cluster_2.1.2               ROCR_1.0-11                
 [19] globals_0.16.2              Biostrings_2.60.0          
 [21] matrixStats_0.58.0          modplots_1.0.0             
 [23] spatstat.sparse_3.0-0       prettyunits_1.1.1          
 [25] colorspace_2.0-1            blob_1.2.1                 
 [27] ggrepel_0.9.1               xfun_0.34                  
 [29] crayon_1.4.1                RCurl_1.98-1.3             
 [31] jsonlite_1.7.2              spatstat.data_3.0-0        
 [33] phangorn_2.7.0              ape_5.5                    
 [35] survival_3.2-11             zoo_1.8-9                  
 [37] glue_1.6.2                  polyclip_1.10-0            
 [39] gtable_0.3.0                zlibbioc_1.38.0            
 [41] XVector_0.32.0              leiden_0.3.9               
 [43] DelayedArray_0.18.0         SingleCellExperiment_1.14.1
 [45] future.apply_1.7.0          BiocGenerics_0.38.0        
 [47] abind_1.4-5                 scales_1.1.1               
 [49] pheatmap_1.0.12             DBI_1.1.1                  
 [51] miniUI_0.1.1.1              Rcpp_1.0.7                 
 [53] progress_1.2.2              viridisLite_0.4.0          
 [55] xtable_1.8-4                reticulate_1.22            
 [57] spatstat.core_2.1-2         bit_4.0.4                  
 [59] stats4_4.1.0                htmlwidgets_1.5.3          
 [61] httr_1.4.2                  ellipsis_0.3.2             
 [63] ica_1.0-2                   XML_3.99-0.6               
 [65] farver_2.1.0                pkgconfig_2.0.3            
 [67] sass_0.4.0                  uwot_0.1.10                
 [69] deldir_1.0-6                utf8_1.2.1                 
 [71] labeling_0.4.2              tidyselect_1.2.0           
 [73] rlang_1.0.6                 reshape2_1.4.4             
 [75] later_1.2.0                 AnnotationDbi_1.54.0       
 [77] munsell_0.5.0               tools_4.1.0                
 [79] cachem_1.0.5                cli_3.4.1                  
 [81] generics_0.1.3              RSQLite_2.2.7              
 [83] ggridges_0.5.3              org.Gg.eg.db_3.13.0        
 [85] evaluate_0.20               stringr_1.4.0              
 [87] fastmap_1.1.0               yaml_2.2.1                 
 [89] goftest_1.2-2               knitr_1.41                 
 [91] bit64_4.0.5                 fitdistrplus_1.1-6         
 [93] purrr_0.3.4                 RANN_2.6.1                 
 [95] KEGGREST_1.32.0             pbapply_1.4-3              
 [97] future_1.30.0               nlme_3.1-152               
 [99] mime_0.10                   compiler_4.1.0             
[101] plotly_4.10.0               png_0.1-7                  
[103] spatstat.utils_3.0-1        tibble_3.1.8               
[105] bslib_0.2.5.1               stringi_1.6.2              
[107] highr_0.9                   lattice_0.20-44            
[109] Matrix_1.3-3                vctrs_0.5.1                
[111] pillar_1.8.1                lifecycle_1.0.3            
[113] spatstat.geom_3.0-3         lmtest_0.9-38              
[115] jquerylib_0.1.4             RcppAnnoy_0.0.19           
[117] data.table_1.14.0           bitops_1.0-7               
[119] irlba_2.3.3                 GenomicRanges_1.44.0       
[121] httpuv_1.6.1                R6_2.5.0                   
[123] promises_1.2.0.1            KernSmooth_2.23-20         
[125] IRanges_2.26.0              parallelly_1.33.0          
[127] codetools_0.2-18            MASS_7.3-54                
[129] assertthat_0.2.1            MAST_1.18.0                
[131] SummarizedExperiment_1.22.0 withr_2.4.2                
[133] sctransform_0.3.3           S4Vectors_0.30.0           
[135] GenomeInfoDbData_1.2.6      hms_1.1.0                  
[137] mgcv_1.8-35                 parallel_4.1.0             
[139] quadprog_1.5-8              grid_4.1.0                 
[141] rpart_4.1-15                rmarkdown_2.17             
[143] MatrixGenerics_1.4.0        Biobase_2.52.0             
[145] shiny_1.6.0